{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-04T22:13:49.000232Z",
     "start_time": "2018-07-04T22:13:48.989118Z"
    }
   },
   "source": [
    "This notebook presents the pre-processing of the S&P 500 data used to produce the input tensors of the neural network. <br>\n",
    "For each stock, the input is a raw time series of the prices (High, Low, Open, Close). \n",
    "The output is a matrix of 4 rows and n (number of available data points) columns. <br>\n",
    "The columns correspond to:\n",
    "- Close(t-1)/Open(t-1)\n",
    "- High(t-1)/Open(t-1)\n",
    "- Low(t-1)/Open(t-1)\n",
    "- Open(t)/Open(t-1)\n",
    "    \n",
    "<u>Remark:</u> We don't need to normalize the data since it's already of ratio of 2 prices closed to one. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-05-02T21:36:22.007862Z",
     "start_time": "2018-05-02T21:36:19.154744Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/Selim/anaconda/lib/python3.6/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "Using TensorFlow backend.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<matplotlib.figure.Figure at 0x10cd6f8d0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#for navigation in the folders\n",
    "import os\n",
    "import pathlib\n",
    "\n",
    "from time import strptime\n",
    "from datetime import datetime\n",
    "\n",
    "from tqdm import tqdm\n",
    "\n",
    "#for plot\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.image as mpimg\n",
    "%matplotlib inline\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns \n",
    "import PIL\n",
    "import pickle\n",
    "from time import strftime\n",
    "\n",
    "import seaborn as sns\n",
    "sns.despine()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-04-28T22:11:59.059202Z",
     "start_time": "2018-04-28T22:11:59.056226Z"
    }
   },
   "source": [
    "# Processing "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-05-02T21:36:22.033094Z",
     "start_time": "2018-05-02T21:36:22.019840Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n",
      "There are 505 different stocks.\n"
     ]
    }
   ],
   "source": [
    "data_dir = '/individual_stocks_5yr/'\n",
    "directory = os.getcwd() + data_dir # path to the files\n",
    "files_tags = os.listdir(directory) #these are the differents pdf files\n",
    "\n",
    "#this is here because hidden files are also shown in the list. \n",
    "for file in files_tags:\n",
    "    if file[0] == '.':\n",
    "        files_tags.remove(file)\n",
    "stock_name = [file.split('_')[0] for file in files_tags]\n",
    "stocks = [file for file in files_tags]\n",
    "print(len(stock_name) == len(stocks))\n",
    "print('There are {} different stocks.'.format(len(stock_name)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-05-02T21:36:24.183642Z",
     "start_time": "2018-05-02T21:36:22.044990Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 505/505 [00:02<00:00, 238.36it/s]\n"
     ]
    }
   ],
   "source": [
    "kept_stocks = list()\n",
    "not_kept_stocks = list()\n",
    "\n",
    "for s in tqdm(stocks):\n",
    "    df = pd.read_csv(os.getcwd() + data_dir + s)\n",
    "        \n",
    "    if len(df)!=1259:\n",
    "        \n",
    "        not_kept_stocks.append(s)\n",
    "    else:\n",
    "        kept_stocks.append(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-05-02T21:36:24.222065Z",
     "start_time": "2018-05-02T21:36:24.217244Z"
    }
   },
   "outputs": [],
   "source": [
    "kept_stock_rl = [kept_stocks[3], kept_stocks[7], kept_stocks[12], kept_stocks[37], kept_stocks[42]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-05-02T21:36:24.324650Z",
     "start_time": "2018-05-02T21:36:24.252411Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5/5 [00:00<00:00, 146.31it/s]\n"
     ]
    }
   ],
   "source": [
    "list_open = list()\n",
    "list_close = list()\n",
    "list_high = list()\n",
    "list_low = list()\n",
    "\n",
    "for s in tqdm(kept_stock_rl):\n",
    "    data = pd.read_csv(os.getcwd() + data_dir + s).fillna('bfill').copy()\n",
    "    data = data[['open', 'close', 'high', 'low']]\n",
    "    list_open.append(data.open.values)\n",
    "    list_close.append(data.close.values)\n",
    "    list_high.append(data.high.values)\n",
    "    list_low.append(data.low.values)\n",
    "\n",
    "array_open = np.transpose(np.array(list_open))[:-1]\n",
    "array_open_of_the_day = np.transpose(np.array(list_open))[1:]\n",
    "array_close = np.transpose(np.array(list_close))[:-1]\n",
    "array_high = np.transpose(np.array(list_high))[:-1]\n",
    "array_low = np.transpose(np.array(list_low))[:-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-05-02T21:36:24.611014Z",
     "start_time": "2018-05-02T21:36:24.599197Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4, 5, 1258)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = np.transpose(np.array([array_close/array_open, \n",
    "                           array_high/array_open,\n",
    "                           array_low/array_open,\n",
    "                           array_open_of_the_day/array_open]), axes= (0,2,1))\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-04T22:20:34.439882Z",
     "start_time": "2018-07-04T22:20:34.422446Z"
    }
   },
   "source": [
    "The shape corresponds to:\n",
    "- 4: Number of features\n",
    "- 5: Number of stocks we want to study\n",
    "- 17030: Number of data points"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-05-01T19:51:39.295121Z",
     "start_time": "2018-05-01T19:51:39.287613Z"
    }
   },
   "outputs": [],
   "source": [
    "np.save('./np_data/input.npy', X)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
